get_upper_tri = function(cormat){
    cormat[lower.tri(cormat)] = NA
    diag(cormat) = NA
    return(cormat)
}

hard_thresh = function(R, th){
  R_th = R
  R_th[abs(R) <= th] = 0
  return(R_th)
}

1. Data import

Generate phyloseq objects

# The Nomic data are not publicly available

# Metadata
meta_data = read_csv("../data/nomic/nomic_meta.csv")

# Taxonomy
tax = read_tsv("../data/nomic/nomic_taxnomy.txt") %>% 
  arrange(otu_id)
otu_id = tax$otu_id
tax = data.frame(tax[, -1]) %>%
  rowwise() %>%
  dplyr::mutate_all(function(x) strsplit(x, "__")[[1]][2]) %>%
  mutate(
    species = ifelse(!is.na(species) & !is.na(genus),
                     paste(strsplit(genus, "")[[1]][1], species, sep = "."),
                     NA)) %>%
  ungroup()
tax = as.matrix(tax)
rownames(tax) = otu_id

# Day 30

# OTU table
otu_table = read_tsv("../data/nomic/nomic_otu_30.txt")
sample_id = otu_table$`#SampleID`
otu_id = colnames(otu_table)[-1]
otu_table = t(otu_table[, -1])
otu_table = data.frame(otu_table)
rownames(otu_table) = otu_id
colnames(otu_table) = sample_id
otu_table = as.matrix(otu_table)

# OTU object
OTU = otu_table(otu_table, taxa_are_rows = TRUE)
META = sample_data(meta_data)
sample_names(META) = meta_data$`#SampleID`
TAX = tax_table(tax)
otu_data30 = phyloseq(OTU, TAX, META)

# Aggregate to phylum level
phylum_data30 = aggregate_taxa(otu_data30, "phylum")
phylum_data30 = subset_taxa(phylum_data30, phylum != "Unknown")

# Aggregate to family level
family_data30 = aggregate_taxa(otu_data30, "family")
family_data30 = subset_taxa(family_data30, family != "Unknown")

# Day 120 

# OTU table
otu_table = read_tsv("../data/nomic/nomic_otu_120.txt") 
sample_id = otu_table$`#SampleID`
otu_id = colnames(otu_table)[-1]
otu_table = t(otu_table[, -1])
otu_table = data.frame(otu_table)
rownames(otu_table) = otu_id
colnames(otu_table) = sample_id
otu_table = as.matrix(otu_table)

# OTU object
OTU = otu_table(otu_table, taxa_are_rows = TRUE)
META = sample_data(meta_data)
sample_names(META) = meta_data$`#SampleID`
TAX = tax_table(tax)
otu_data120 = phyloseq(OTU, TAX, META)

# Aggregate to phylum level
phylum_data120 = aggregate_taxa(otu_data120, "phylum")
phylum_data120 = subset_taxa(phylum_data120, phylum != "Unknown")

# Aggregate to family level
family_data120 = aggregate_taxa(otu_data120, "family")
family_data120 = subset_taxa(family_data120, family != "Unknown")

# Day 365 

# OTU table
otu_table = read_tsv("../data/nomic/nomic_otu_365.txt") 
sample_id = otu_table$`#SampleID`
otu_id = colnames(otu_table)[-1]
otu_table = t(otu_table[, -1])
otu_table = data.frame(otu_table)
rownames(otu_table) = otu_id
colnames(otu_table) = sample_id
otu_table = as.matrix(otu_table)

# OTU object
OTU = otu_table(otu_table, taxa_are_rows = TRUE)
META = sample_data(meta_data)
sample_names(META) = meta_data$`#SampleID`
TAX = tax_table(tax)
otu_data365 = phyloseq(OTU, TAX, META)

# Aggregate to phylum level
phylum_data365 = aggregate_taxa(otu_data365, "phylum")
phylum_data365 = subset_taxa(phylum_data365, phylum != "Unknown")

# Aggregate to family level
family_data365 = aggregate_taxa(otu_data365, "family")
family_data365 = subset_taxa(family_data365, family != "Unknown")

Quality check

# Sample size
df_sample = data.frame(day30 = nsamples(otu_data30), 
                       day120 = nsamples(otu_data120), 
                       day365 = nsamples(otu_data365))

# Find the most abundant taxa across time
top_phylum30 = top_taxa(phylum_data30, n = 50)
top_phylum120 = top_taxa(phylum_data120, n = 50)
top_phylum365 = top_taxa(phylum_data365, n = 50)

top_family30 = top_taxa(family_data30, n = 50)
top_family120 = top_taxa(family_data120, n = 50)
top_family365 = top_taxa(family_data365, n = 50)

common_phylum = Reduce(intersect, list(day30 = top_phylum30, 
                                       day120 = top_phylum120,
                                       day365 = top_phylum365))
common_phylum = common_phylum[!is.na(common_phylum)]
top_phylum = sort(common_phylum[1:10])

common_family = Reduce(intersect, list(day30 = top_family30, 
                                       day120 = top_family120,
                                       day365 = top_family365))
common_family = common_family[!is.na(common_family)]
top_family = sort(common_family[1:10])

# top_phylum = c("Acidobacteria", "Actinobacteria", "Bacteroidetes", "Cyanobacteria", "Firmicutes", 
#                "Fusobacteria", "Proteobacteria", "Tenericutes", "TM7", "Verrucomicrobia")
# top_family = c("Bacteroidaceae", "Bifidobacteriaceae", "Clostridiaceae", "Enterobacteriaceae", "Fusobacteriaceae",
#                "Lachnospiraceae", "Moraxellaceae", "Pasteurellaceae", "Ruminococcaceae", "Staphylococcaceae")

# Prevalence
# Phylum
qc_phylum = data.frame(matrix(NA, nrow = 10, ncol = 4))
colnames(qc_phylum) = c("phylum", "day30", "day120", "day365")

core_phylum30 = subset_taxa(phylum_data30, phylum %in% top_phylum)
core_phylum120 = subset_taxa(phylum_data120, phylum %in% top_phylum)
core_phylum365 = subset_taxa(phylum_data365, phylum %in% top_phylum)
qc_phylum$phylum = taxa_names(core_phylum30)
qc_phylum$day30 = apply(abundances(core_phylum30), 1, function(x) sum(x != 0)/nsamples(core_phylum30))
qc_phylum$day120 = apply(abundances(core_phylum120), 1, function(x) sum(x != 0)/nsamples(core_phylum120))
qc_phylum$day365 = apply(abundances(core_phylum365), 1, function(x) sum(x != 0)/nsamples(core_phylum365))

# Family
qc_family = data.frame(matrix(NA, nrow = 10, ncol = 4))
colnames(qc_family) = c("family", "day30", "day120", "day365")

core_family30 = subset_taxa(family_data30, family %in% top_family)
core_family120 = subset_taxa(family_data120, family %in% top_family)
core_family365 = subset_taxa(family_data365, family %in% top_family)
qc_family$family = taxa_names(core_family30)
qc_family$day30 = apply(abundances(core_family30), 1, function(x) sum(x != 0)/nsamples(core_family30))
qc_family$day120 = apply(abundances(core_family120), 1, function(x) sum(x != 0)/nsamples(core_family120))
qc_family$day365 = apply(abundances(core_family365), 1, function(x) sum(x != 0)/nsamples(core_family365))

# Outputs 
saveRDS(df_sample, file = "../data/nomic/qc/df_sample.rds")
saveRDS(df_sample, file = "../data/nomic/qc/qc_phylum.rds")
saveRDS(df_sample, file = "../data/nomic/qc/qc_family.rds")

2. Data summary

Sample size

qc_sample = read_rds("../data/nomic/qc/df_sample.rds")
colnames(qc_sample) = c("Day 30", "Day 120", "Day 365")

datatable(qc_sample)

Phylum prevalence

qc_phylum = read_rds("../data/nomic/qc/qc_phylum.rds") %>%
  mutate_if(is.numeric, function(x) round(x, 2)) %>%
  arrange(phylum)
colnames(qc_phylum) = c("Phylum", "Day 30", "Day 120", "Day 365")

datatable(qc_phylum)

Family prevalence

qc_family = read_rds("../data/nomic/qc/qc_family.rds") %>%
  mutate_if(is.numeric, function(x) round(x, 2)) %>%
  arrange(family)
colnames(qc_family) = c("Family", "Day 30", "Day 120", "Day 365")

datatable(qc_family)

3. SECOM results: phylum level

Run SECOM

pseqs = list(data1 = c(otu_data30, core_phylum30),
             data2 = c(otu_data120, core_phylum120),
             data3 = c(otu_data365, core_phylum365))
pseudo = 0; prv_cut = 0.1; lib_cut = 0; corr_cut = 0.5
wins_quant = c(0.05, 0.95); method = "pearson"; soft = FALSE; thresh_len = 20
n_cv = 10; thresh_hard = 0; max_p_linear = 0.1; n_cl = 2

set.seed(123)
res_linear = secom_linear(pseqs, pseudo, prv_cut, lib_cut, corr_cut, 
                          wins_quant, method, soft, thresh_len, n_cv, 
                          thresh_hard, max_p_linear, n_cl)

R = 1000; max_p_dist = 0.1
set.seed(123)
res_dist = secom_dist(pseqs, pseudo, prv_cut, lib_cut, corr_cut, 
                      wins_quant, R, thresh_hard, max_p_dist, n_cl)

saveRDS(res_linear, file = "../data/nomic/phylum/res_linear.rds")
saveRDS(res_dist, file = "../data/nomic/phylum/res_dist.rds")

Get bias-corrected abundances

# Day 30
# Bias
s_30 = res_linear$s_diff_hat$data1

# Raw abundances
raw_30 = abundances(phylum_data30)
raw_30 = raw_30[, names(s_30)]

# Corrected abundances
o = log(raw_30)
o[is.infinite(o)] = NA
d = nrow(o)
y_hat_30 = o - rowMeans(o, na.rm = TRUE) - t(replicate(d, s_30))
y_hat_30 = exp(y_hat_30)

# Day 120
# Bias
s_120 = res_linear$s_diff_hat$data2

# Raw abundances
raw_120 = abundances(phylum_data120)
raw_120 = raw_120[, names(s_120)]

# Corrected abundances
o = log(raw_120)
o[is.infinite(o)] = NA
d = nrow(o)
y_hat_120 = o - rowMeans(o, na.rm = TRUE) - t(replicate(d, s_120))
y_hat_120 = exp(y_hat_120)

# Day 365
# Bias
s_365 = res_linear$s_diff_hat$data3

# Raw abundances
raw_365 = abundances(phylum_data365)
raw_365 = raw_365[, names(s_365)]

# Corrected abundances
o = log(raw_365)
o[is.infinite(o)] = NA
d = nrow(o)
y_hat_365 = o - rowMeans(o, na.rm = TRUE) - t(replicate(d, s_365))
y_hat_365 = exp(y_hat_365)

phylum_keep = rownames(res_linear$y_hat)
phylum_keep_30 = phylum_keep[grepl("data1", phylum_keep)]
phylum_keep_30 = sort(unique(gsub("data1 - ", "", phylum_keep_30)))
idx = which(rownames(y_hat_30) %in% phylum_keep_30)
y_hat_30 = as.data.frame(y_hat_30[idx, ]) %>%
  bind_rows(as.data.frame(t(data.frame(Others = colSums(y_hat_30[-idx, ], na.rm = TRUE)))))

phylum_keep_120 = phylum_keep[grepl("data2", phylum_keep)]
phylum_keep_120 = sort(unique(gsub("data2 - ", "", phylum_keep_120)))
idx = which(rownames(y_hat_120) %in% phylum_keep_120)
y_hat_120 = as.data.frame(y_hat_120[idx, ]) %>%
  bind_rows(as.data.frame(t(data.frame(Others = colSums(y_hat_120[-idx, ], na.rm = TRUE)))))

phylum_keep_365 = phylum_keep[grepl("data3", phylum_keep)]
phylum_keep_365 = sort(unique(gsub("data3 - ", "", phylum_keep_365)))
idx = which(rownames(y_hat_365) %in% phylum_keep_365)
y_hat_365 = as.data.frame(y_hat_365[idx, ]) %>%
  bind_rows(as.data.frame(t(data.frame(Others = colSums(y_hat_365[-idx, ], na.rm = TRUE)))))

y_hat = y_hat_30 %>%
  rownames_to_column("phylum") %>%
  mutate(day = "30") %>%
  select(phylum, day, everything()) %>%
  bind_rows(
    y_hat_120 %>%
      rownames_to_column("phylum") %>%
      mutate(day = "120") %>%
      select(phylum, day, everything())
  ) %>%
  bind_rows(
    y_hat_365 %>%
      rownames_to_column("phylum") %>%
      mutate(day = "365") %>%
      select(phylum, day, everything())
  )
saveRDS(y_hat, file = "../data/nomic/phylum/y_hat.rds")

Visualization

Correlation coefficients

res_linear = read_rds("../data/nomic/phylum/res_linear.rds")
res_dist = read_rds("../data/nomic/phylum/res_dist.rds")

# Data organization
# Linear relationships
corr_linear = res_linear$corr_fl
cooccur_linear = res_linear$mat_cooccur

# Filter by co-occurrence
overlap = 10
corr_linear[cooccur_linear < overlap] = 0

df_linear = data.frame(get_upper_tri(corr_linear)) %>%
  rownames_to_column("var1") %>%
  pivot_longer(cols = -var1, names_to = "var2", values_to = "value") %>%
  filter(!is.na(value)) %>%
  mutate(var2 = gsub("\\...", " - ", var2),
         value = round(value, 2),
         metric = "Pearson")

# Distance relationships
corr_dist = res_dist$dcorr_fl
cooccur_dist = res_dist$mat_cooccur

# Filter by co-occurrence
overlap = 10
corr_dist[cooccur_dist < overlap] = 0

df_dist = data.frame(get_upper_tri(corr_dist)) %>%
  rownames_to_column("var1") %>%
  pivot_longer(cols = -var1, names_to = "var2", values_to = "value") %>%
  filter(!is.na(value)) %>%
  mutate(var2 = gsub("\\...", " - ", var2),
         value = round(value, 2),
         metric = "Distance")

# Merge datasets
df_corr = df_linear %>%
  bind_rows(
    df_dist
  ) 
phylum_level = sort(union(df_corr$var1, df_corr$var2))
phylum_label = sapply(phylum_level, function(x) strsplit(x, " - ")[[1]][2])
txt_color = case_when(grepl("data1", phylum_level) ~ "#1B9E77",
                      grepl("data2", phylum_level) ~ "#D95F02",
                      TRUE ~ "#7570B3")
df_corr$var1 = factor(df_corr$var1, levels = phylum_level)
df_corr$var2 = factor(df_corr$var2, levels = phylum_level)
df_corr$metric = factor(df_corr$metric, levels = c("Pearson", "Distance"))

p_corr = df_corr %>%
  ggplot(aes(var2, var1, fill = value)) +
  geom_tile(color = "black") +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", na.value = "grey",
                       midpoint = 0, limit = c(-1,1), space = "Lab", 
                       name = NULL) +
  scale_x_discrete(drop = FALSE, labels = phylum_label) +
  scale_y_discrete(drop = FALSE, labels = phylum_label) +
  facet_grid(rows = vars(metric)) +
  geom_text(aes(var2, var1, label = value), color = "black", size = 4) +
  labs(x = NULL, y = NULL, title = NULL) +
  guides(fill = guide_colorbar(barwidth = 1, barheight = 7, 
                               title.position = "right")) +
  theme_bw() +
  geom_vline(xintercept = c(4.5, 9.5), color = "blue", linetype = "dashed") +
  geom_hline(yintercept = c(4.5, 9.5), color = "blue", linetype = "dashed") +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1, 
                                   face = "italic", color = txt_color),
        axis.text.y = element_text(size = 12, face = "italic", color = txt_color),
        strip.text.x = element_text(size = 14),
        strip.text.y = element_text(size = 14),
        legend.text = element_text(size = 12),
        plot.title = element_text(hjust = 0.5, size = 15),
        panel.grid.major = element_blank(),
        axis.ticks = element_blank()) +
  coord_fixed()
p_corr

Mean abundances

y_hat = read_rds("../data/nomic/phylum/y_hat.rds")

df_abn = y_hat %>%
  mutate_if(is.numeric, ~replace_na(., 0)) %>%
  mutate(ave = rowMeans(across(where(is.numeric)))) %>%
  select(phylum, day, ave) %>%
  filter(phylum != "Others")

df_abn$phylum = factor(df_abn$phylum)
df_abn$day = factor(df_abn$day, levels = c("30", "120", "365"))

p_abn = df_abn %>%
  ggplot(aes(x = day, y = ave, fill = phylum)) +
  geom_bar(position = "dodge", stat = "identity") +
  labs(x = "Day", y = "Abundance") +
  scale_fill_brewer(name = NULL, palette = "Set1", drop = FALSE) +
  scale_x_discrete(drop = FALSE) +
  facet_wrap(.~phylum, nrow = 2, scales = "free") +
  theme_bw()
p_abn

Mean relative abundance

y_hat = y_hat %>%
  mutate_if(is.numeric, ~replace_na(., 0)) %>%
  mutate(total = rowSums(across(where(is.numeric))))

df_rel = y_hat %>%
  filter(day == "30") %>%
  mutate(prop = total/sum(total)) %>%
  select(phylum, day, prop) %>%
  bind_rows(
    y_hat %>%
      filter(day == "120") %>%
      mutate(prop = total/sum(total)) %>%
      select(phylum, day, prop)
  ) %>%
  bind_rows(
    y_hat %>%
      filter(day == "365") %>%
      mutate(prop = total/sum(total)) %>%
      select(phylum, day, prop)
  )

df_rel$phylum = factor(df_rel$phylum)
df_rel$phylum = forcats::fct_relevel(df_rel$phylum, "Others", after = Inf)
df_rel$day = factor(df_rel$day, levels = c("30", "120", "365"))
p_rel = df_rel %>%
  ggplot(aes(x = day, y = prop, fill = phylum)) +
  geom_bar(position = "stack", stat = "identity") +
  labs(x = "Day", y = "Relative Abundance") +
  scale_fill_brewer(name = NULL, palette = "Set1") +
  theme_bw()
p_rel

df_rel = df_rel %>%
  mutate(prop = round(prop, 2)) %>%
  pivot_wider(names_from = "day", values_from = "prop") %>%
  arrange(phylum)
datatable(df_rel)
# Combine abundance and relative abundance plots
p_abn = p_abn + 
  theme(legend.position = "none") +
  labs(x = NULL)
p_combine = ggarrange(p_abn, p_rel, nrow = 2,
                      heights = c(1, 1.5), labels = c("a", "b"))

ggsave(plot = p_combine, "../images/supp/supp_nomic_count_phylum.pdf", 
       height = 8, width = 10)   
ggsave(plot = p_combine, "../images/supp/supp_nomic_count_phylum.jpeg", 
       height = 8, width = 10, dpi = 300)

4. SECOM results: family level

Run SECOM

pseqs = list(data1 = c(otu_data30, core_family30),
             data2 = c(otu_data120, core_family120),
             data3 = c(otu_data365, core_family365))
pseudo = 0; prv_cut = 0.1; lib_cut = 0; corr_cut = 0.5
wins_quant = c(0.05, 0.95); method = "pearson"; soft = FALSE; thresh_len = 20
n_cv = 10; thresh_hard = 0; max_p_linear = 0.1; n_cl = 2

set.seed(123)
res_linear = secom_linear(pseqs, pseudo, prv_cut, lib_cut, corr_cut, 
                          wins_quant, method, soft, thresh_len, n_cv, 
                          thresh_hard, max_p_linear, n_cl)

R = 1000; max_p_dist = 0.1
set.seed(123)
res_dist = secom_dist(pseqs, pseudo, prv_cut, lib_cut, corr_cut, 
                      wins_quant, R, thresh_hard, max_p_dist, n_cl)

saveRDS(res_linear, file = "../data/nomic/family/res_linear.rds")
saveRDS(res_dist, file = "../data/nomic/family/res_dist.rds")

Get bias-corrected abundances

# Day 30
# Bias
s_30 = res_linear$s_diff_hat$data1

# Raw abundances
raw_30 = abundances(family_data30)
raw_30 = raw_30[, names(s_30)]

# Corrected abundances
o = log(raw_30)
o[is.infinite(o)] = NA
d = nrow(o)
y_hat_30 = o - rowMeans(o, na.rm = TRUE) - t(replicate(d, s_30))
y_hat_30 = exp(y_hat_30)

# Day 120
# Bias
s_120 = res_linear$s_diff_hat$data2

# Raw abundances
raw_120 = abundances(family_data120)
raw_120 = raw_120[, names(s_120)]

# Corrected abundances
o = log(raw_120)
o[is.infinite(o)] = NA
d = nrow(o)
y_hat_120 = o - rowMeans(o, na.rm = TRUE) - t(replicate(d, s_120))
y_hat_120 = exp(y_hat_120)

# Day 365
# Bias
s_365 = res_linear$s_diff_hat$data3

# Raw abundances
raw_365 = abundances(family_data365)
raw_365 = raw_365[, names(s_365)]

# Corrected abundances
o = log(raw_365)
o[is.infinite(o)] = NA
d = nrow(o)
y_hat_365 = o - rowMeans(o, na.rm = TRUE) - t(replicate(d, s_365))
y_hat_365 = exp(y_hat_365)

family_keep = rownames(res_linear$y_hat)
family_keep_30 = family_keep[grepl("data1", family_keep)]
family_keep_30 = sort(unique(gsub("data1 - ", "", family_keep_30)))
idx = which(rownames(y_hat_30) %in% family_keep_30)
y_hat_30 = as.data.frame(y_hat_30[idx, ]) %>%
  bind_rows(as.data.frame(t(data.frame(Others = colSums(y_hat_30[-idx, ], na.rm = TRUE)))))

family_keep_120 = family_keep[grepl("data2", family_keep)]
family_keep_120 = sort(unique(gsub("data2 - ", "", family_keep_120)))
idx = which(rownames(y_hat_120) %in% family_keep_120)
y_hat_120 = as.data.frame(y_hat_120[idx, ]) %>%
  bind_rows(as.data.frame(t(data.frame(Others = colSums(y_hat_120[-idx, ], na.rm = TRUE)))))

family_keep_365 = family_keep[grepl("data3", family_keep)]
family_keep_365 = sort(unique(gsub("data3 - ", "", family_keep_365)))
idx = which(rownames(y_hat_365) %in% family_keep_365)
y_hat_365 = as.data.frame(y_hat_365[idx, ]) %>%
  bind_rows(as.data.frame(t(data.frame(Others = colSums(y_hat_365[-idx, ], na.rm = TRUE)))))

y_hat = y_hat_30 %>%
  rownames_to_column("family") %>%
  mutate(day = "30") %>%
  select(family, day, everything()) %>%
  bind_rows(
    y_hat_120 %>%
      rownames_to_column("family") %>%
      mutate(day = "120") %>%
      select(family, day, everything())
  ) %>%
  bind_rows(
    y_hat_365 %>%
      rownames_to_column("family") %>%
      mutate(day = "365") %>%
      select(family, day, everything())
  )
saveRDS(y_hat, file = "../data/nomic/family/y_hat.rds")

Visualization

Correlation coefficients

res_linear = read_rds("../data/nomic/family/res_linear.rds")
res_dist = read_rds("../data/nomic/family/res_dist.rds")

# Data organization
# Linear relationships
corr_linear = res_linear$corr_fl
cooccur_linear = res_linear$mat_cooccur

# Filter by co-occurrence
overlap = 10
corr_linear[cooccur_linear < overlap] = 0

df_linear = data.frame(get_upper_tri(corr_linear)) %>%
  rownames_to_column("var1") %>%
  pivot_longer(cols = -var1, names_to = "var2", values_to = "value") %>%
  filter(!is.na(value)) %>%
  mutate(var2 = gsub("\\...", " - ", var2),
         value = round(value, 2),
         metric = "Pearson")

# Distance relationships
corr_dist = res_dist$dcorr_fl
cooccur_dist = res_dist$mat_cooccur

# Filter by co-occurrence
overlap = 10
corr_dist[cooccur_dist < overlap] = 0

df_dist = data.frame(get_upper_tri(corr_dist)) %>%
  rownames_to_column("var1") %>%
  pivot_longer(cols = -var1, names_to = "var2", values_to = "value") %>%
  filter(!is.na(value)) %>%
  mutate(var2 = gsub("\\...", " - ", var2),
         value = round(value, 2),
         metric = "Distance")

# Merge datasets
df_corr = df_linear %>%
  bind_rows(
    df_dist
  ) 
family_level = sort(union(df_corr$var1, df_corr$var2))
family_label = sapply(family_level, function(x) strsplit(x, " - ")[[1]][2])
txt_color = case_when(grepl("data1", family_level) ~ "#1B9E77",
                      grepl("data2", family_level) ~ "#D95F02",
                      TRUE ~ "#7570B3")
df_corr$var1 = factor(df_corr$var1, levels = family_level)
df_corr$var2 = factor(df_corr$var2, levels = family_level)
df_corr$metric = factor(df_corr$metric, levels = c("Pearson", "Distance"))

p_corr = df_corr %>%
  ggplot(aes(var2, var1, fill = value)) +
  geom_tile(color = "black") +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", na.value = "grey",
                       midpoint = 0, limit = c(-1,1), space = "Lab", 
                       name = NULL) +
  scale_x_discrete(drop = FALSE, labels = family_label) +
  scale_y_discrete(drop = FALSE, labels = family_label) +
  facet_grid(rows = vars(metric)) +
  geom_text(aes(var2, var1, label = value), color = "black", size = 4) +
  labs(x = NULL, y = NULL, title = NULL) +
  guides(fill = guide_colorbar(barwidth = 1, barheight = 7, 
                               title.position = "right")) +
  theme_bw() +
  geom_vline(xintercept = c(8.5, 15.5), color = "blue", linetype = "dashed") +
  geom_hline(yintercept = c(8.5, 15.5), color = "blue", linetype = "dashed") +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1, 
                                   face = "italic", color = txt_color),
        axis.text.y = element_text(size = 12, face = "italic", color = txt_color),
        strip.text.x = element_text(size = 14),
        strip.text.y = element_text(size = 14),
        legend.text = element_text(size = 12),
        plot.title = element_text(hjust = 0.5, size = 15),
        panel.grid.major = element_blank(),
        axis.ticks = element_blank()) +
  coord_fixed()
p_corr

ggsave(plot = p_corr, "../images/main/nomic_corr_family.pdf", height = 18, width = 16)   
ggsave(plot = p_corr, "../images/main/nomic_corr_family.jpeg", height = 18, width = 16, dpi = 300)

Mean abundances

y_hat = read_rds("../data/nomic/family/y_hat.rds")

df_abn = y_hat %>%
  mutate_if(is.numeric, ~replace_na(., 0)) %>%
  mutate(ave = rowMeans(across(where(is.numeric)))) %>%
  select(family, day, ave) %>%
  filter(family != "Others")

df_abn$family = factor(df_abn$family)
df_abn$day = factor(df_abn$day, levels = c("30", "120", "365"))

p_abn = df_abn %>%
  ggplot(aes(x = day, y = ave, fill = family)) +
  geom_bar(position = "dodge", stat = "identity") +
  labs(x = "Day", y = "Abundance") +
  scale_fill_brewer(name = NULL, palette = "Paired", drop = FALSE) +
  scale_x_discrete(drop = FALSE) +
  facet_wrap(.~family, nrow = 2, scales = "free") +
  theme_bw()
p_abn

Mean relative abundance

y_hat = y_hat %>%
  mutate_if(is.numeric, ~replace_na(., 0)) %>%
  mutate(total = rowSums(across(where(is.numeric))))

df_rel = y_hat %>%
  filter(day == "30") %>%
  mutate(prop = total/sum(total)) %>%
  select(family, day, prop) %>%
  bind_rows(
    y_hat %>%
      filter(day == "120") %>%
      mutate(prop = total/sum(total)) %>%
      select(family, day, prop)
  ) %>%
  bind_rows(
    y_hat %>%
      filter(day == "365") %>%
      mutate(prop = total/sum(total)) %>%
      select(family, day, prop)
  )

df_rel$family = factor(df_rel$family)
df_rel$family = forcats::fct_relevel(df_rel$family, "Others", after = Inf)
df_rel$day = factor(df_rel$day, levels = c("30", "120", "365"))
p_rel = df_rel %>%
  ggplot(aes(x = day, y = prop, fill = family)) +
  geom_bar(position = "stack", stat = "identity") +
  labs(x = "Day", y = "Relative Abundance") +
  scale_fill_brewer(name = NULL, palette = "Paired") +
  theme_bw()
p_rel

df_rel = df_rel %>%
  mutate(prop = round(prop, 2)) %>%
  pivot_wider(names_from = "day", values_from = "prop") %>%
  arrange(family)
datatable(df_rel)
# Combine abundance and relative abundance plots
p_abn = p_abn + 
  theme(legend.position = "none") +
  labs(x = NULL)
p_combine = ggarrange(p_abn, p_rel, nrow = 2,
                      heights = c(1, 1.5), labels = c("a", "b"))

ggsave(plot = p_combine, "../images/supp/supp_nomic_count_family.pdf", 
       height = 8, width = 10)   
ggsave(plot = p_combine, "../images/supp/supp_nomic_count_family.jpeg", 
       height = 8, width = 10, dpi = 300)

Nonlinearity example

Enterobacteriaceae vs. Ruminococcaceae at day 120

df_eg = data.frame(x = res_linear$y_hat["data2 - Enterobacteriaceae", ],
                   y = res_linear$y_hat["data2 - Ruminococcaceae", ])
df_eg = df_eg[complete.cases(df_eg), ]

rlm1 = lmrob(y ~ x, data = df_eg, control = lmrob.control(max.it = 100))
rlm2 = lmrob(y ~ poly(x, 4), data = df_eg, control = lmrob.control(max.it = 100))
r_squared1 = round(summary(rlm1)$adj.r.squared, 2)
r_squared2 = round(summary(rlm2)$adj.r.squared, 2)

df_ann = data.frame(x = c(0.75, 0.75), y = c(1.75, 1.25)) %>%
  mutate(label = c(paste0("R^{2}~(linear) == ",  r_squared1),
                   paste0("R^{2}~(Poly) == ",  r_squared2)))

scatter_eg = df_eg %>%
  ggplot(aes(x = x, y = y)) +
  geom_point(size = 4) +
  geom_text(data = df_ann, aes(x = x, y = y, label = label), hjust = 0, 
            color = "orange", size = 4, fontface = "bold", parse = TRUE) +
  labs(x = "Enterobacteriaceae", y = "Ruminococcaceae") +
  geom_smooth(method = "lm", formula = y ~ poly(x, 4), se = FALSE, linetype = "dashed") + 
  theme_bw()
  
scatter_eg

ggsave(plot = scatter_eg, "../images/supp/supp_nonlinear_example2.pdf", height = 5, width = 6.25)
ggsave(plot = scatter_eg, "../images/supp/supp_nonlinear_example2.jpeg", height = 5, width = 6.25, dpi = 300)

5. SparCC results

Run SparCC

raw_30 = abundances(family_data30)
rownames(raw_30) = paste0("data1 - ", rownames(raw_30))

raw_120 = abundances(family_data120)
rownames(raw_120) = paste0("data2 - ", rownames(raw_120))

raw_365 = abundances(family_data365)
rownames(raw_365) = paste0("data3 - ", rownames(raw_365))

raw_30_120 = data.frame(raw_30, check.names = FALSE) %>%
  bind_rows(data.frame(raw_120, check.names = FALSE)) %>%
  select_if(~ !any(is.na(.)))

raw_30_365 = data.frame(raw_30, check.names = FALSE) %>%
  bind_rows(data.frame(raw_365, check.names = FALSE)) %>%
  select_if(~ !any(is.na(.)))

raw_120_365 = data.frame(raw_120, check.names = FALSE) %>%
  bind_rows(data.frame(raw_365, check.names = FALSE)) %>%
  select_if(~ !any(is.na(.)))

# SparCC cannot handle missing values, so we have to calculate the correlation
# matrices separately 
set.seed(123)
R_hat_sparcc11 = sparcc(t(raw_30))$Cor
R_hat_sparcc22 = sparcc(t(raw_120))$Cor
R_hat_sparcc33 = sparcc(t(raw_365))$Cor

R_hat_sparcc12 = sparcc(t(raw_30_120))$Cor
R_hat_sparcc12 = R_hat_sparcc12[seq_len(nrow(raw_30)), 
                                seq(from = nrow(raw_30) + 1, 
                                    to = nrow(raw_30_120))]
R_hat_sparcc13 = sparcc(t(raw_30_365))$Cor
R_hat_sparcc13 = R_hat_sparcc13[seq_len(nrow(raw_30)), 
                                seq(from = nrow(raw_30) + 1, 
                                    to = nrow(raw_30_365))]
R_hat_sparcc23 = sparcc(t(raw_120_365))$Cor
R_hat_sparcc23 = R_hat_sparcc23[seq_len(nrow(raw_120)), 
                                seq(from = nrow(raw_120) + 1, 
                                    to = nrow(raw_120_365))]

R_hat_sparcc = matrix(NA, 
                      nrow = nrow(raw_30) + nrow(raw_120) + nrow(raw_365),
                      ncol = nrow(raw_30) + nrow(raw_120) + nrow(raw_365))
R_hat_sparcc[seq_len(nrow(raw_30)), seq_len(nrow(raw_30))] = R_hat_sparcc11
R_hat_sparcc[seq_len(nrow(raw_30)), 
             seq(from = nrow(raw_30) + 1, 
                 to = nrow(raw_30) + nrow(raw_120))] = R_hat_sparcc12
R_hat_sparcc[seq_len(nrow(raw_30)), 
             seq(from = nrow(raw_30) + nrow(raw_120) + 1, 
                 to = nrow(raw_30) + nrow(raw_120) + nrow(raw_365))] = R_hat_sparcc13
R_hat_sparcc[seq(from = nrow(raw_30) + 1, 
                 to = nrow(raw_30) + nrow(raw_120)), 
             seq(from = nrow(raw_30) + 1, 
                 to = nrow(raw_30) + nrow(raw_120))] = R_hat_sparcc22
R_hat_sparcc[seq(from = nrow(raw_30) + 1, 
                 to = nrow(raw_30) + nrow(raw_120)), 
             seq(from = nrow(raw_30) + nrow(raw_120) + 1, 
                 to = nrow(raw_30) + nrow(raw_120) + nrow(raw_365))] = R_hat_sparcc23
R_hat_sparcc[seq(from = nrow(raw_30) + nrow(raw_120) + 1, 
                 to = nrow(raw_30) + nrow(raw_120) + nrow(raw_365)), 
             seq(from = nrow(raw_30) + nrow(raw_120) + 1, 
                 to = nrow(raw_30) + nrow(raw_120) + nrow(raw_365))] = R_hat_sparcc33
R_hat_sparcc[lower.tri(R_hat_sparcc)] = R_hat_sparcc[upper.tri(R_hat_sparcc)]

dimnames(R_hat_sparcc) = list(c(rownames(raw_30), rownames(raw_120), rownames(raw_365)), 
                              c(rownames(raw_30), rownames(raw_120), rownames(raw_365)))

saveRDS(R_hat_sparcc, file = "../data/nomic/sparcc/R_hat_sparcc.rds")

Visualization

R_hat_sparcc = read_rds("../data/nomic/sparcc/R_hat_sparcc.rds")
R_hat_sparcc = hard_thresh(R_hat_sparcc, 0.3)
col_ind = match(family_level, colnames(R_hat_sparcc))
R_hat_sparcc = R_hat_sparcc[col_ind[!is.na(col_ind)], col_ind[!is.na(col_ind)]]

df_sparcc = data.frame(get_upper_tri(R_hat_sparcc)) %>%
  rownames_to_column("var1") %>%
  pivot_longer(cols = -var1, names_to = "var2", values_to = "value") %>%
  mutate(var2 = gsub("\\...", " - ", var2)) %>%
  filter(!is.na(value)) %>%
  mutate(value = round(value, 2))

family_level = sort(union(df_sparcc$var1, df_sparcc$var2))
family_label = sapply(family_level, function(x) strsplit(x, " - ")[[1]][2])
txt_color = case_when(grepl("data1", family_level) ~ "#1B9E77",
                      grepl("data2", family_level) ~ "#D95F02",
                      TRUE ~ "#7570B3")
df_sparcc$var1 = factor(df_sparcc$var1, levels = family_level)
df_sparcc$var2 = factor(df_sparcc$var2, levels = family_level)

p_sparcc = df_sparcc %>%
  ggplot(aes(var2, var1, fill = value)) +
  geom_tile(color = "black") +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", na.value = "grey",
                       midpoint = 0, limit = c(-1,1), space = "Lab", 
                       name = NULL) +
  scale_x_discrete(drop = FALSE, labels = family_label) +
  scale_y_discrete(drop = FALSE, labels = family_label) +
  geom_text(aes(var2, var1, label = value), color = "black", size = 4) +
  labs(x = NULL, y = NULL, title = NULL) +
  guides(fill = guide_colorbar(barwidth = 1, barheight = 7, 
                               title.position = "right")) +
  theme_bw() +
  geom_vline(xintercept = c(8.5, 15.5), color = "blue", linetype = "dashed") +
  geom_hline(yintercept = c(8.5, 15.5), color = "blue", linetype = "dashed") +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1, 
                                   face = "italic", color = txt_color),
        axis.text.y = element_text(size = 12, face = "italic", color = txt_color),
        strip.text.x = element_text(size = 14),
        strip.text.y = element_text(size = 14),
        legend.text = element_text(size = 12),
        plot.title = element_text(hjust = 0.5, size = 15),
        panel.grid.major = element_blank(),
        axis.ticks = element_blank()) +
  coord_fixed()
p_sparcc

ggsave(plot = p_sparcc, "../images/supp/supp_nomic_sparcc.pdf", height = 10, width = 13)
ggsave(plot = p_sparcc, "../images/supp/supp_nomic_sparcc.jpeg", height = 10, width = 13, dpi = 300)

Session information

sessionInfo()
R version 4.2.0 (2022-04-22)
Platform: x86_64-apple-darwin17.0 (64-bit)
Running under: macOS Big Sur/Monterey 10.16

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] DT_0.23             robustbase_0.95-0   openxlsx_4.2.5     
 [4] SpiecEasi_1.1.2     ggpubr_0.4.0        randomcoloR_1.1.0.1
 [7] ANCOMBC_1.6.2       microbiome_1.18.0   phyloseq_1.40.0    
[10] forcats_0.5.1       stringr_1.4.0       dplyr_1.0.9        
[13] purrr_0.3.4         readr_2.1.2         tidyr_1.2.0        
[16] tibble_3.1.7        ggplot2_3.3.6       tidyverse_1.3.1    

loaded via a namespace (and not attached):
  [1] readxl_1.4.0           backports_1.4.1        Hmisc_4.7-0           
  [4] VGAM_1.1-6             plyr_1.8.7             igraph_1.3.2          
  [7] splines_4.2.0          crosstalk_1.2.0        GenomeInfoDb_1.32.2   
 [10] digest_0.6.29          foreach_1.5.2          htmltools_0.5.2       
 [13] fansi_1.0.3            magrittr_2.0.3         checkmate_2.1.0       
 [16] cluster_2.1.3          doParallel_1.0.17      tzdb_0.3.0            
 [19] Biostrings_2.64.0      modelr_0.1.8           jpeg_0.1-9            
 [22] colorspace_2.0-3       rvest_1.0.2            haven_2.5.0           
 [25] rbibutils_2.2.8        xfun_0.31              crayon_1.5.1          
 [28] RCurl_1.98-1.7         jsonlite_1.8.0         Exact_3.1             
 [31] survival_3.3-1         iterators_1.0.14       ape_5.6-2             
 [34] glue_1.6.2             gtable_0.3.0           zlibbioc_1.42.0       
 [37] XVector_0.36.0         V8_4.2.0               car_3.1-0             
 [40] Rhdf5lib_1.18.2        shape_1.4.6            DEoptimR_1.0-11       
 [43] BiocGenerics_0.42.0    abind_1.4-5            scales_1.2.0          
 [46] mvtnorm_1.1-3          DBI_1.1.3              rngtools_1.5.2        
 [49] rstatix_0.7.0          Rcpp_1.0.8.3           htmlTable_2.4.0       
 [52] foreign_0.8-82         proxy_0.4-27           Formula_1.2-4         
 [55] stats4_4.2.0           glmnet_4.1-4           htmlwidgets_1.5.4     
 [58] httr_1.4.3             pulsar_0.3.7           RColorBrewer_1.1-3    
 [61] ellipsis_0.3.2         farver_2.1.0           pkgconfig_2.0.3       
 [64] nnet_7.3-17            sass_0.4.1             dbplyr_2.2.0          
 [67] utf8_1.2.2             labeling_0.4.2         tidyselect_1.1.2      
 [70] rlang_1.0.2            reshape2_1.4.4         munsell_0.5.0         
 [73] cellranger_1.1.0       tools_4.2.0            cli_3.3.0             
 [76] generics_0.1.2         ade4_1.7-19            broom_0.8.0           
 [79] evaluate_0.15          biomformat_1.24.0      fastmap_1.1.0         
 [82] yaml_2.3.5             knitr_1.39             fs_1.5.2              
 [85] zip_2.2.0              rootSolve_1.8.2.3      nlme_3.1-158          
 [88] doRNG_1.8.2            xml2_1.3.3             compiler_4.2.0        
 [91] rstudioapi_0.13        curl_4.3.2             png_0.1-7             
 [94] ggsignif_0.6.3         e1071_1.7-11           huge_1.3.5            
 [97] reprex_2.0.1           bslib_0.3.1            DescTools_0.99.45     
[100] stringi_1.7.6          gsl_2.1-7.1            highr_0.9             
[103] lattice_0.20-45        Matrix_1.4-1           nloptr_2.0.3          
[106] vegan_2.6-2            permute_0.9-7          multtest_2.52.0       
[109] vctrs_0.4.1            pillar_1.7.0           lifecycle_1.0.1       
[112] rhdf5filters_1.8.0     Rdpack_2.3.1           jquerylib_0.1.4       
[115] cowplot_1.1.1          data.table_1.14.2      bitops_1.0-7          
[118] lmom_2.9               R6_2.5.1               latticeExtra_0.6-29   
[121] gridExtra_2.3          IRanges_2.30.0         gld_2.6.4             
[124] codetools_0.2-18       boot_1.3-28            energy_1.7-10         
[127] MASS_7.3-57            assertthat_0.2.1       rhdf5_2.40.0          
[130] withr_2.5.0            S4Vectors_0.34.0       GenomeInfoDbData_1.2.8
[133] mgcv_1.8-40            expm_0.999-6           parallel_4.2.0        
[136] hms_1.1.1              grid_4.2.0             rpart_4.1.16          
[139] class_7.3-20           rmarkdown_2.14         carData_3.0-5         
[142] Rtsne_0.16             Biobase_2.56.0         lubridate_1.8.0       
[145] base64enc_0.1-3